** project folder used by both the library reference and the CSV import;
%let projdir = C:\andrew\Stats 747\2008;

** library reference hold points at the project folder;
libname hold "&projdir";

** import cafe data;
** row 1 of cafe.csv supplies the variable names and the data start on row 2;
** replace overwrites work.cafe if it already exists;
proc import out = work.cafe
            datafile = "&projdir.\cafe.csv"
            dbms = csv replace;
     getnames = yes;
     datarow = 2;
run;


** list the variables and attributes of the imported cafe data;
PROC CONTENTS DATA=cafe;
RUN;

** frequency check of every survey item before any cleaning;
proc freq data=cafe;
	tables q1 q2 q3 q7 q8
	       q4a1-q4a8
	       q5a q5b
	       q6a_1-q6a_10;
run;

** clean this data;

** temporary data set;

** build cafe2 from cafe - label the q6a, q7 and q8 items, cap q1 and recode the q6a items;
data cafe2;
  set cafe;
  label
    q6a_1="6(a) Convenient Hours"
    q6a_2="6(b) Speed of Service"
    q6a_3="6(c) Value for Money"
    q6a_4="6(d) Employee Friendliness"
    q6a_5="6(e) Cleanliness of the facility"
    q6a_6="6(f) Selection of Food"
    q6a_7="6(g) Appearance of Food"
    q6a_8="6(h) Freshness of Food"
    q6a_9="6(i) Healthy Choices"
    q6a_10="6(j) Availability of Nutritional Information"
    q7 = "Age"
    q8 = "Gender"
  ;

  * cap the number of coffees at 13;
  if q1 > 13 then q1 = 13;

  * recode the value 8 to 6 on every q6a item;
  * NOTE(review) - an earlier comment said 8 should become missing but the code has always set it to 6 - confirm which is intended;
  array q6 {*} q6a_1-q6a_10;
  do i = 1 to dim(q6);
    if q6{i} = 8 then q6{i} = 6;
  end;
  drop i; * loop counter is not wanted in cafe2;
run;

** check changes;

** frequencies of q1 and the q6a items after cleaning;
proc freq data=cafe2;
	tables q1 q6a_1-q6a_10;
run;




** k-means cluster - run this many times and keep the most
useful solution - change the random seed each time (below it is 456);

** cluster the q6a ratings into at most 3 clusters with seed 456;
** the output data set clusters is written by the out option;
proc fastclus
	data=cafe2
	out=clusters
	maxclusters=3
	replace=random
	random=456;
	var q6a_1-q6a_10;
run;

** investigate;

** order by cluster so the by-group means below can run;
proc sort data=clusters;
	by cluster;
run;

** mean of each q6a rating within each cluster;
proc means data=clusters mean;
	var q6a_1-q6a_10;
	by cluster;
run;


** hmm perhaps easier to see if each q6a item is recoded to a binary flag (binq6a) for high value;


** recode each q6a rating into a 0 or 1 flag - 1 when the rating is 3 or lower;
** a missing rating stays missing - SAS orders missing below every number so a bare <= 3 test would flag it as 1;
data clusters2;
	set clusters;
	array q6 {*} q6a_1-q6a_10;
	array binq6 {*} binq6a_1-binq6a_10;
	do i = 1 to dim(q6);
		if missing(q6{i}) then binq6{i} = .;
		else binq6{i} = (q6{i} <= 3); * comparison yields 1 when true and 0 when false;
	end;
	drop i;
run;


* check - frequencies of each rating listed next to its binary flag;
proc freq data=clusters2;
	tables
		q6a_1 binq6a_1   q6a_2 binq6a_2
		q6a_3 binq6a_3   q6a_4 binq6a_4
		q6a_5 binq6a_5   q6a_6 binq6a_6
		q6a_7 binq6a_7   q6a_8 binq6a_8
		q6a_9 binq6a_9   q6a_10 binq6a_10;
run;


** give each binary flag the same question wording as its source rating;
data clusters2;
  set clusters2;
  label binq6a_1  = "6(a) Convenient Hours";
  label binq6a_2  = "6(b) Speed of Service";
  label binq6a_3  = "6(c) Value for Money";
  label binq6a_4  = "6(d) Employee Friendliness";
  label binq6a_5  = "6(e) Cleanliness of the facility";
  label binq6a_6  = "6(f) Selection of Food";
  label binq6a_7  = "6(g) Appearance of Food";
  label binq6a_8  = "6(h) Freshness of Food";
  label binq6a_9  = "6(i) Healthy Choices";
  label binq6a_10 = "6(j) Availability of Nutritional Information";
run;


** mean of each binary flag within each cluster;
proc means data=clusters2 mean;
	var binq6a_1-binq6a_10;
	class cluster;
run;

** good for telling the clusters apart - one row per binary flag and one mean column per cluster;
proc tabulate data=clusters2;
	var binq6a_1-binq6a_10;
	class cluster;
	table
		binq6a_1
		binq6a_2
		binq6a_3
		binq6a_4
		binq6a_5
		binq6a_6
		binq6a_7
		binq6a_8
		binq6a_9
		binq6a_10
		,
		mean*cluster;
run;

** same layout on the original ratings - one row per q6a item and one mean column per cluster;
proc tabulate data=clusters2;
	var q6a_1-q6a_10;
	class cluster;
	table
		q6a_1
		q6a_2
		q6a_3
		q6a_4
		q6a_5
		q6a_6
		q6a_7
		q6a_8
		q6a_9
		q6a_10
		,
		mean*cluster;
run;

** cluster means of the remaining survey items - none of these were used to form the clusters;
proc tabulate data=clusters2;
	var q1 q2 q3 q7 q8
	    q4a1-q4a8
	    q5a q5b;
	class cluster;
	table
		q1 q2 q3 q7 q8
		q4a1 q4a2 q4a3 q4a4
		q4a5 q4a6 q4a7 q4a8
		q5a q5b
		,
		mean*cluster;
run;

** discriminant analysis with cluster as the class variable;
** author note - q4a3 to q4a8 are all zeroes so only q4a1 and q4a2 are used;
proc discrim data=clusters2;
	class cluster;
	var q1 q2 q3 q7 q8
	    q4a1 q4a2
	    q5a q5b;
run;